Introduction

Let us first start by installing and loading the packages we will use during this whole assignment.

# Install (from CRAN) and attach every package this assignment depends on.
# The previous `require(x) || i.p("x") && require(x)` chains relied on the
# is.null(install.packages(...)) trick and on require()'s FALSE return;
# an explicit install-on-demand loop is clearer and errors loudly via
# library() if an installation fails.
ensure_attached <- function(pkg) {
    # require() returns FALSE (instead of erroring) when the package is
    # missing, which lets us install it and then attach it properly.
    if (!require(pkg, character.only = TRUE, quietly = TRUE)) {
        install.packages(pkg, repos = "https://cloud.r-project.org/")
        library(pkg, character.only = TRUE)
    }
}

invisible(lapply(
    c(
        # gsub on files
        "xfun",
        # Tibbles and plots
        "tidyverse", "patchwork",
        # Tsibbles and models for them
        "tsibble", "imputeTS", "feasts", "fable", "fable.prophet", "fabletools",
        # Machine Learning
        "modeltime", "timetk", "rsample", "parsnip",
        # The looks
        "reactablefmtr"
    ),
    ensure_attached
))

# Plot theme
theme_set(theme_minimal())

# English for dates
Sys.setlocale(locale = "en_GB.UTF-8")

# Useful functions

# Most frequent value (the mode) of a vector, computed from its frequency
# table. The result is coerced to numeric, so this is intended for numeric
# input (table() names are always character).
most_freq <- function(x) {
    counts <- table(x)
    as.numeric(names(counts)[which.max(counts)])
}

# Pair up the i-th elements of every input, Python-style: returns a list
# with one inner list per position. Map() is exactly
# mapply(list, ..., SIMPLIFY = FALSE), so behaviour is unchanged.
zip <- function(...) {
    Map(list, ...)
}

# Like zip(), but each inner list also carries its 1-based position under
# the name `ix` (length taken from the first argument).
enumerate <- function(...) {
    first_arg <- list(...)[[1]]
    zip(ix = seq_along(first_arg), ...)
}

# Plot functions
# One fixed colour per series, keyed by the (post-rename) column names.
# NB: this masks grDevices::colors(); kept because the whole document
# indexes it as colors[.var].
colors <- setNames(
    c("#CD3333", "#EE7621", "#EEC900", "#43CD80", "#009ACD", "#8B008B"),
    c("CO",      "C6H6",    "NOy",     "NO2",     "Temp",    "TempM")
)

# Three-panel diagnostic display for one column of a tsibble:
#   top          — the series itself (autoplot),
#   bottom-left  — its ACF with ±1.96/sqrt(n) white-noise bands,
#   bottom-right — a density histogram (default type = "histogram") or the
#                  PACF (any other value; called with "partial" in this doc).
# .var is the column name as a string and doubles as the key into `colors`.
# lag_max is forwarded to ACF()/PACF() (NULL = package default).
# The patchwork layout p1 / (p2 | p3) is print()ed explicitly so the plot
# also renders from inside loops and knitr chunks.
pretty_ts  <- function(.data, .var, title, type = "histogram", lag_max = NULL) {
    # 1.96/sqrt(n): approximate 95% confidence band for a white-noise ACF
    norm <- sqrt(nrow(.data))
    p1   <- .data %>%
        autoplot(.data[[.var]], color = colors[.var]) +
            labs(title = title, x = "", y = "")
    # ACF drawn manually (hline + segments) so the bands and colours match
    # the rest of the document's styling
    p2   <- .data %>%
        ACF(.data[[.var]], lag_max = lag_max) %>% 
        ggplot(aes(x = lag, y = acf)) +
            geom_hline(aes(yintercept = 0)) +
            geom_segment(mapping = aes(xend = lag, yend = 0),
                         color = colors[.var], alpha = 0.75) +
            geom_hline(aes(yintercept =  1.96/norm),
                       linetype = 2, color = "#838B8B") +
            geom_hline(aes(yintercept = -1.96/norm),
                       linetype = 2, color = "#838B8B") +
            labs(title = "", x = "", y = "ACF")
    if (type == "histogram") {
        p3 <- .data %>% 
            ggplot(aes(x = .data[[.var]])) +
            geom_histogram(aes(y = after_stat(density)),
                           color = "white", fill = colors[.var], alpha = 0.25) +
            geom_density(lwd = 1, color = colors[.var]) +
            labs(title = "", x = "", y = "Density")
    } else {
        # Any non-"histogram" type: show the PACF instead of the density
        p3 <- .data %>%
            PACF(.data[[.var]], lag_max = lag_max) %>% 
            ggplot(aes(x = lag, y = pacf)) +
                geom_hline(aes(yintercept = 0)) +
                geom_segment(mapping = aes(xend = lag, yend = 0),
                             color = colors[.var], alpha = 0.75) +
                geom_hline(aes(yintercept =  1.96/norm),
                           linetype = 2, color = "#838B8B") +
                geom_hline(aes(yintercept = -1.96/norm),
                           linetype = 2, color = "#838B8B") +
                labs(title = "", x = "", y = "PACF")
    }
    
    print(p1 / (p2 | p3))
}

# Residual diagnostics laid out like pretty_ts(): residual series on top,
# its ACF bottom-left and a density histogram bottom-right.
# .data is expected to be the output of residuals() on a mable, i.e. to
# contain a `.resid` column; .var only selects the colour from `colors`
# and is never read from .data itself.
pretty_resid  <- function(.data, .var, title, lag_max = NULL) {
    # 1.96/sqrt(n): approximate 95% confidence band for a white-noise ACF
    norm <- sqrt(nrow(.data))
    p1   <- .data %>%
        autoplot(.data[[".resid"]], color = colors[.var]) +
            labs(title = title, x = "", y = "")
    p2   <- .data %>%
        ACF(.data[[".resid"]], lag_max = lag_max) %>% 
        ggplot(aes(x = lag, y = acf)) +
            geom_hline(aes(yintercept = 0)) +
            geom_segment(mapping = aes(xend = lag, yend = 0),
                         color = colors[.var], alpha = 0.75) +
            geom_hline(aes(yintercept =  1.96/norm),
                       linetype = 2, color = "#838B8B") +
            geom_hline(aes(yintercept = -1.96/norm),
                       linetype = 2, color = "#838B8B") +
            labs(title = "", x = "", y = "ACF")
    # Residual distribution: histogram + kernel density overlay
    p3   <- .data %>% 
        ggplot(aes(x = .data[[".resid"]])) +
        geom_histogram(aes(y = after_stat(density)),
                       color = "white", fill = colors[.var], alpha = 0.25) +
        geom_density(lwd = 1, color = colors[.var]) +
        labs(title = "", x = "", y = "Density")
    
    print(p1 / (p2 | p3))
}

# Side-by-side comparison of a series and its Box-Cox transform, each
# overlaid with its STL trend, to judge whether the transform tames
# skewness/heteroskedasticity. Assumes .data has a `Date` index column.
# NOTE(review): as_tibble() on the single "trend" ts yields a column named
# "x" — that is what the bare `x` in autolayer() below refers to; confirm
# this naming survives tibble/tsibble upgrades.
compare_box_cox <- function(.data, .var, title) {
    # Box-Cox with automatically chosen lambda (timetk::box_cox_vec)
    .data$bc <- box_cox_vec(.data[[.var]], silent = TRUE)
    # STL decomposition (second positional arg = seasonal window of 24);
    # keep only the trend component of each version
    stl_nm   <- stl(as.ts(.data)[, .var], 24)$time.series[, "trend"] %>% as_tibble()
    stl_bc   <- stl(as.ts(.data)[, "bc"], 24)$time.series[, "trend"] %>% as_tibble()
    # Re-attach the original timestamps and convert the trends to tsibbles
    # so they can be layered onto the autoplots below
    stl_nm$index <- .data$Date
    stl_bc$index <- .data$Date
    stl_nm   <- stl_nm %>% as_tsibble(index = index)
    stl_bc   <- stl_bc %>% as_tsibble(index = index)
    p1 <- .data %>% 
        autoplot(.data[[.var]], color = colors[.var], alpha = 0.25) +
            autolayer(stl_nm, x, color = colors[.var]) +
            labs(title = title, subtitle = "No transformation applied", x = "", y = "")
    p2 <- .data %>% 
        autoplot(.data[["bc"]], color = colors[.var], alpha = 0.25) +
            autolayer(stl_bc, x, color = colors[.var]) +
            labs(title = "", subtitle = "Box-Cox transformed", x = "", y = "")
    
    print(p1 | p2)
}

The presented dataset contains information pertaining to temperature measurements, alongside the levels of various atmospheric pollutants, obtained from a monitoring station located at street level within an urban area of Italy. This data has been sourced from the UCI Machine Learning Repository.

The dataset encompasses hourly recordings spanning a period of one year. The objective of utilizing this data is to develop a forecast model capable of predicting the concentration levels of these same pollutants for a period of six months into the future, with the goal of gauging their potential impact on ambient temperature conditions.

Getting the data

The presented code serves the purpose of extracting and retrieving the relevant data, in addition to implementing preliminary transformations that aid in parsing the input effectively. Specifically, the code is designed to fetch the data and perform basic manipulations to conform to the CSV standard for efficient processing.

download_url <- paste0("https://archive.ics.uci.edu/ml/",
                       "machine-learning-databases/00360/AirQualityUCI.zip")

# Fetch the archive once. mode = "wb" keeps the download binary-safe on
# Windows, where the default text mode corrupts zip files.
if (!file.exists("AirQualityUCI.zip")) {
    download.file(download_url, "./AirQualityUCI.zip", mode = "wb")
}
# The raw file uses ';' as field separator and ',' as the decimal mark.
# Normalise in place to standard CSV: decimal commas -> dots, drop the
# empty trailing ';;' columns, then ';' separators -> ','.
if (!file.exists("AirQualityUCI.csv")) {
    unzip("AirQualityUCI.zip")
    gsub_file("AirQualityUCI.csv", ",",  ".")
    gsub_file("AirQualityUCI.csv", ";;", "")
    gsub_file("AirQualityUCI.csv", ";",  ",")
}

data <- read_csv("AirQualityUCI.csv", show_col_types = FALSE)

Preprocessing

Our initial step involves the creation of a variable that shall serve as the temporal index for the time series. This index shall comprise the Date and Time variables, which shall be combined and subsequently transformed to adhere to the “UTC+1” 1 2 format. Additionally, we shall include within the pipeline the substitution of any instances of -200 with NA, as specified in the dataset’s source webpage.

It is pertinent to note that while these NA values shall be imputed at a later stage, they shall not be dealt with at this time. Finally, we shall transform the resultant data into a tsibble object to facilitate future analysis and visualization.

# Build the temporal index: fuse Date + Time into a single POSIXct column,
# placed first, then drop the originals.
# NOTE(review): the prose says "UTC+1" but tz = "GMT" is used here — this
# only shifts labels, but confirm it is intentional.
data_ts <- data %>%
    add_column(Datetime = as.POSIXct(paste(data$Date, data$Time), 
                                     format = "%d/%m/%Y %H.%M.%S",
                                     tz = "GMT"), .before = 1) %>% 
    select(!c(Date, Time)) %>% 
    # -200 is the dataset's documented missing-value sentinel
    replace(. == -200, NA) %>% 
    as_tsibble(index = Datetime)

Next, we shall select the time series that align with our research interests. To accomplish this task, we shall reference the available information pertaining to the distinct time series present within the dataset and extract those that are sourced from the “reference analyser,” along with the temperature measurements.

Subsequently, we shall narrow our focus to these selected time series for the duration of the assignment. By isolating these specific variables, we can limit our analysis to only the most pertinent data, thereby enabling us to derive more precise insights and outcomes.

# Keep the index, the temperature, and only the reference-analyser
# series (their names end with "(GT)")
data_ts <- data_ts %>% 
    select(c("Datetime", "T", ends_with("(GT)")))

Subsequently, we shall analyse the distribution of missing values within the dataset. Upon examination, we observe that for the variable NMHC(GT), the vast majority of the available data is, in fact, composed of missing values.

Consequently, we have opted to remove this particular variable from consideration and instead focus on the remaining variables that possess a manageable quantity of missing values that we can address. By removing variables that are predominantly composed of missing values, we can limit the extent to which these values may distort our findings and derive more accurate insights from the dataset.

# Count missing values per column: NMHC(GT) is mostly NA, so it is dropped
data_ts %>% is.na() %>% colSums()
## Datetime        T   CO(GT) NMHC(GT) C6H6(GT)  NOx(GT)  NO2(GT) 
##        0      366     1683     8443      366     1639     1642
data_ts <-  data_ts %>% select(!"NMHC(GT)")

# Rename for convenience
colnames(data_ts)    <- gsub("[(]GT[)]", "", colnames(data_ts))  # strip "(GT)" suffix
colnames(data_ts)[1] <- "Date"  # Datetime -> Date (hourly index)
colnames(data_ts)[2] <- "Temp"  # T -> Temp (also avoids clashing with T/TRUE)

Descriptive Analysis

Our subsequent step involves visualizing all the selected variables. Upon analysing the temperature measurements, we observe the anticipated oscillatory behaviour that aligns with an annual period. Furthermore, upon examining the Autocorrelation Function (ACF) plot, we can discern a daily seasonality pattern that is expected from temperature measurements.

These findings are in line with the anticipated temperature trends, where the temperature levels tend to rise and fall periodically throughout the year, with daily fluctuations arising as a result of diurnal changes in temperature.

data_ts %>% pretty_ts("Temp", title = "Temperature [°C]")

Upon analysing the CO concentrations, we note a distinct pattern compared to the temperature measurements. Firstly, we observe a significant presence of missing values in the time series plot, which appear to be clustered in patches. We suspect that these missing values may arise due to sensor shutdowns during specific time periods.

Additionally, we observe a notable decrease in CO concentration during the summer season, likely due to the reduced use of heating devices owing to warmer weather conditions in the analysed region.

Upon examining the ACF plot, we observe a semi-daily period, which may arise from the regular switching on and off of heating devices, one of the main sources of CO emissions. Specifically, we observe this switching to occur during the morning and late afternoon. However, the seasonality of this trend appears to be weaker than that observed in the case of temperature, indicating that CO concentrations are influenced by a more diverse range of factors beyond just heating devices.

data_ts %>% pretty_ts("CO", title = "CO [mg/m³]")

Benzene (\(C_6H_6\)) emission sources are a subset of the carbon monoxide emission sources. This is evident when observing the plots for these two variables, which exhibit a very similar behaviour in their respective time series. However, upon examining the ACF plot, we note that the seasonality for Benzene is only daily and comes with a much smaller trend component, unlike that observed in the case of carbon monoxide.

We speculate that the reason for this difference may arise from the reduced number of emission sources for Benzene compared to carbon monoxide. As a result, the factors contributing to Benzene concentrations are likely to be more localized and less diverse, leading to a reduced trend component and a more distinct daily seasonality pattern.

data_ts %>% pretty_ts("C6H6", title = "Benzene [μg/m³]")

We are dealing with two columns that represent the levels of nitrogen oxides in the air, namely \(NO_x\) and \(NO_2\). The former reflects the concentration of all nitrogen oxides, while the latter only accounts for nitrogen dioxide. To separate the contributions of \(NO_x\) and \(NO_2\), we use the conversion formula between parts per billion (ppb) and micrograms per cubic meter (μg/m³). This formula is given by: \[\begin{equation*} C(\mathrm{ppb}) = C(\mu\mathrm{g/m}^3) \cdot \frac{V(\mathrm{air})}{M(NO_2)} \;, \end{equation*}\] where \(M(NO_2)\) refers to the molecular mass of nitrogen dioxide, which is 46.006 g/mol, and \(V(\mathrm{air})\) is the volume of air under consideration. We calculate \(V(\mathrm{air})\) using Gay-Lussac’s law, which involves approximating the air pressure as a constant and assuming that the volume is \(24.45\) litres at a temperature of \(25\) degrees Celsius. By applying this formula, we can decouple the contributions of \(NO_x\) and \(NO_2\) and analyse their effects separately.

# Strip the NO2 contribution (converted from μg/m³ to ppb) out of total NOx:
# ppb = μg/m³ × 24.45 / M(NO2), with 24.45 L/mol the molar volume at 25 °C,
# and the (273.15 + Temp)/298.15 factor rescaling that volume from the
# 25 °C reference (298.15 K) to the measured temperature.
# Fixed: the Kelvin offsets were written as 273.25/298.25; absolute zero is
# −273.15 °C, so 0 °C = 273.15 K and 25 °C = 298.15 K.
data_ts$NOy <- data_ts$NOx - 24.45/46.006*data_ts$NO2 * (273.15 + data_ts$Temp)/298.15

# Delete pathological cases (negative concentrations are physically impossible)
data_ts$NOy[data_ts$NOy < 0] <- NA

We have renamed this \(NO_2\) stripped variable to NOy for convenience.

Looking at the \(NO_y\) time series plot, we observe a sudden increase at the end of summer, which does not seem to disappear on the following year. Moreover, the ACF plot shows both semi-daily and daily seasonality, with the latter being more prominent.

We may attribute the daily seasonality of \(NO_y\) to the frequency of car pollution, as it is well known that most people travel to work by car or bus. As a result, the traffic congestion during peak hours results in higher pollution levels. Additionally, the semi-daily seasonality may also come from this factor, but it could also be related to other sources of pollution, such as industrial processes, which may follow specific work schedules.

data_ts %>% pretty_ts("NOy", title = "Nitrogen Oxides [ppb]")

On the other hand, \(NO_2\) exhibits a very different pattern, justifying the prior decoupling we made. For one, it does not appear to have sudden changes across different epochs of the year like the rest of the gases, with a concentration that, at first glance, resembles a kind of stationary signal. Moreover, the seasonality appears to be strictly daily, with a very fast ACF decrease, which indicates absence of trend.

data_ts %>% pretty_ts("NO2", title = "Nitrogen Dioxide [μg/m³]")

Forecasting

In this section, our aim is to forecast both the gases and temperature with a 6-month horizon, assuming a relation between them. The first step is to find suitable models for each gas and forecast them into the future. Then, we will use the gas predictions to construct a model that estimates the temperature using a time series model and the gas forecasts. This will help us understand which gases have a greater impact on the temperature.

Polluting agents

Considering that we are dealing with four distinct time series, each representing a different polluting agent, it is imperative that we adopt a systematic and organized approach to modelling and forecasting. Therefore, we propose the following recipe for this section:

  1. We will assess whether a transformation is appropriate for each agent to normalize the distribution of the values. Given the right-skewed nature of the distributions, a Box-Cox transformation may be beneficial.

  2. We will fit five models for each agent, including an automatic ARIMA, a manual ARIMA, an automatic ETS, a Prophet model, and an automatic TSLM with various Fourier components to capture all possible periodicities. For the manual ARIMA, we will differentiate the series to white noise and interpret the ACF and PACF.

  3. We will test each of these models using 5-fold cross-validation, averaging the precision metrics across the different splits and selecting the model with the smallest MAE.

  4. Finally, the selected model will be used to forecast the next six months’ concentrations for each polluting agent.

In order to ensure a proper evaluation of our models, it is important to first split the dataset into a training and testing partition. The training partition will be used to make decisions and apply cross-validation, while the test partition will be used to evaluate the performance of the final model.

To achieve this, we will allocate 5 times more data to the training set, 10 months, than to the test set, 2 months. We will then perform 5-fold cross-validation, where we will use 5 months for training and 1 month for testing. It is worth noting that the cross-validation splits will be taken from the training partition.

Furthermore, some of the data in the dataset may be missing, represented as NA values. In order to make our models valid, we will impute these missing values using linear interpolation between neighbouring observations.

Once these steps have been completed, we can proceed to modelling and forecasting the polluting agents and temperature data.

# Impute missing values by linear interpolation (imputeTS::na_interpolation)
data_ts <- data_ts %>% na_interpolation()

# Duplicate Temp variable for later convenience
data_ts <- data_ts %>% mutate(TempM = Temp)

# Train-test split and CV splits: 10 months train / 2 months test, then
# 5 rolling CV folds taken inside the training partition (5 months fit,
# 1 month assess, non-overlapping, non-cumulative windows)
traintest <- data_ts %>%
    as_tibble() %>% 
    time_series_split(date_var = Date, initial = "10 months", assess = "2 months")
cvsplits  <- training(traintest) %>% 
    time_series_cv(date_var    = Date,
                   initial     = "5 months",
                   assess      = "1 month",
                   skip        = "1 month",
                   slice_limit = 5,
                   cumulative  = FALSE)

# Convert to tsibble for convenience
train_ts <- training(traintest) %>% as_tsibble(index = Date)
test_ts  <- testing(traintest)  %>% as_tsibble(index = Date)

Carbon Monoxide

First of all, we are going to consider a Box-Cox transformation to get a more normal distribution of the values, reducing skewness and heteroskedasticity. By looking at the following plots it does indeed appear that this transformation helps in getting a time series with which we may work more easily.

# Fixed typo in the displayed title: "atmosferic" -> "atmospheric"
train_ts %>% compare_box_cox("CO", "CO atmospheric concentration [mg/m³]")

Now we need to examine the time series to create an appropriate ARIMA model. For this purpose, we are going to apply a 24-hour differentiation, which we decide upon by looking at the next graph. Then, we check and see using unitroot tests that a normal difference is not needed for the time series to reach a white-noise behaviour.

# Box-Cox the series and inspect series/PACF up to 3 days (72 h) of lags
train_ts %>%
    mutate(CO = box_cox_vec(CO, silent = TRUE)) %>% 
    pretty_ts("CO", "Box-Cox transformed CO", type = "partial", lag_max = 72)

# Apply a seasonal (lag-24) difference, then check whether an additional
# ordinary difference is needed — ndiffs = 0 below says it is not
train_ts %>% 
    mutate(BC_CO  = box_cox_vec(CO, silent = TRUE),
           dBC_CO = difference(BC_CO, 24)) %>% 
    features(dBC_CO, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Difference and see if stationary
train_ts %>%
    mutate(CO = difference(box_cox_vec(CO, silent = TRUE), 24)) %>% 
    pretty_ts("CO", "Diff_24 Box-Cox transformed CO", type = "partial", lag_max = 72)

Looking at the ACF and PACF we decide that an \(ARIMA(3,0,0)(3,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.

# Manual ARIMA orders read off the differenced ACF/PACF plots above:
# Look at PACF    => p = 3
# Look at PACF_24 => P = 3
summary_CO <- list()
# 5-fold time-series CV: fit the five candidate models on each fold's
# 5-month training window, forecast its 1-month assessment window, and
# store the fold's accuracy table.
for (sp in enumerate(cvsplits$splits)) {
    # sp is list(ix = fold number, <rsample split object>)
    trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
    tstdat <- testing(sp[[2]])  %>% as_tsibble(index = Date)

    # Box-Cox lambda estimated per fold on that fold's training data only
    lam <- auto_lambda(trndat$CO)
    summary_CO[[sp$ix]] <- trndat %>% 
        model(
            manual_arima = ARIMA(box_cox(CO, lam) ~ 1 + pdq(3, 0, 0) + 
                                     PDQ(3, 1, 0, period = 24), method = "CSS"),
            auto_arima   = ARIMA(box_cox(CO, lam) ~ PDQ(period = 24)),
            prophet      = prophet(box_cox(CO, lam) ~ season(period = 24)),
            auto_ets     = ETS(box_cox(CO, lam) ~ season(period = 24)),
            auto_tslm    = TSLM(box_cox(CO, lam) ~ trend() +
                                    fourier(period = "1 day",   K = 3) +
                                    fourier(period = "1 year",  K = 3))) %>% 
        forecast(tstdat) %>% 
        accuracy(data_ts)
}

# Average each accuracy metric over the five folds, per model
models_CO <- bind_rows(summary_CO) %>%
    select(!.type) %>%
    group_by(.model) %>%
    summarise_all(mean, na.rm = TRUE)

# Render the accuracy table (modeltime/reactable styling)
models_CO %>% 
    mutate(.model_id = seq_len(nrow(.)), .before = .model) %>% 
    rename(.model_desc = .model) %>% 
    relocate(c(MAE, MAPE, MASE, RMSE), .after = .model_desc) %>% 
    select(!c(ME, MPE, RMSSE,  ACF1)) %>% 
    table_modeltime_accuracy(theme = cosmo(), showSortIcon = FALSE, .searchable = FALSE)
# Selection criterion: smallest cross-validated MAE
print(paste("The best model for Carbon Monoxide is",
            models_CO$.model[which.min(models_CO$MAE)]))
## [1] "The best model for Carbon Monoxide is prophet"

We see that the residuals follow a normal distribution and a white-noise like waveform. Nevertheless, the ACF does still not resemble a white-noise ACF. We also visualize its performance on the test partition which captures the tendency and with the confidence intervals offers a good approximation on the range where real values are measured. It fails nevertheless to reproduce the correct oscillatory movement.

# Refit the CV winner (prophet) on the full 10-month training partition
lam_CO   <- auto_lambda(train_ts$CO)
model_CO <- train_ts %>%
    model(prophet(box_cox(CO, lam_CO) ~ season(period = 24)))

# Residual diagnostics
model_CO %>%
    residuals() %>%
    pretty_resid("CO", title = "Residuals for Carbon Monoxide [mg/m³]")

# Forecast the 2-month test partition and overlay it on the observed values
f_CO <- forecast(model_CO, test_ts)
train_ts %>%
    filter(year(Date) == 2005) %>% 
    autoplot(CO) +
        autolayer(f_CO, CO, color = colors["CO"]) +
        autolayer(test_ts, CO, color = "#838B8B", alpha = 0.75) +
        labs(x = "", y = "Carbon Monoxide [mg/m³]",
             title = paste("MAE =", round(accuracy(f_CO, data_ts)$MAE, 4)))

Benzene

As with CO, we are going to consider a Box-Cox transformation for the same reasons. By looking at the following plots it does indeed appear that this transformation helps in getting a time series with which we may work more easily.

# Fixed typo in the displayed title: "atmosferic" -> "atmospheric"
train_ts %>% compare_box_cox("C6H6", "Benzene atmospheric concentration [μg/m³]")

Looking at previous ACF, we are going to apply a 24-hour differentiation. Then, we check and see using unitroot tests that a normal difference is not needed for the time series to reach a white-noise behaviour.

# Box-Cox the series and inspect series/PACF up to 3 days (72 h) of lags
train_ts %>%
    mutate(C6H6 = box_cox_vec(C6H6, silent = TRUE)) %>% 
    pretty_ts("C6H6", "Box-Cox transformed Benzene", type = "partial", lag_max = 72)

# Apply a seasonal (lag-24) difference, then check whether an additional
# ordinary difference is needed — ndiffs = 0 below says it is not
train_ts %>% 
    mutate(BC_C6H6  = box_cox_vec(C6H6, silent = TRUE),
           dBC_C6H6 = difference(BC_C6H6, 24)) %>% 
    features(dBC_C6H6, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Difference and see if stationary
train_ts %>%
    mutate(C6H6 = difference(box_cox_vec(C6H6, silent = TRUE), 24)) %>% 
    pretty_ts("C6H6", "Diff_24 Box-Cox transformed Benzene", type = "partial", lag_max = 72)

Looking at the ACF and PACF we decide that an \(ARIMA(2,0,0)(3,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS Prophet and TSLM.

# Manual ARIMA orders read off the differenced ACF/PACF plots above:
# Look at PACF    => p = 2
# Look at PACF_24 => P = 3
summary_C6H6 <- list()
# 5-fold time-series CV: fit the five candidates per fold, forecast the
# fold's 1-month assessment window, store the accuracy table
for (sp in enumerate(cvsplits$splits)) {
    # sp is list(ix = fold number, <rsample split object>)
    trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
    tstdat <- testing(sp[[2]])  %>% as_tsibble(index = Date)

    # Per-fold Box-Cox lambda
    lam <- auto_lambda(trndat$C6H6)
    summary_C6H6[[sp$ix]] <- trndat %>% 
        model(
            manual_arima = ARIMA(box_cox(C6H6, lam) ~ 1 + pdq(2, 0, 0) +
                                     PDQ(3, 1, 0, period = 24), method = "CSS"),
            auto_arima   = ARIMA(box_cox(C6H6, lam) ~ PDQ(period = 24)),
            prophet      = prophet(box_cox(C6H6, lam) ~ season(period = 24)),
            auto_ets     = ETS(box_cox(C6H6, lam) ~ season(period = 24)),
            auto_tslm    = TSLM(box_cox(C6H6, lam) ~ trend() +
                                    fourier(period = "1 day",   K = 3) +
                                    fourier(period = "1 year",  K = 3))) %>% 
        forecast(tstdat) %>% 
        accuracy(data_ts)
}

# Average each accuracy metric over the five folds, per model
models_C6H6 <- bind_rows(summary_C6H6) %>%
    select(!.type) %>%
    group_by(.model) %>%
    summarise_all(mean, na.rm = TRUE)

# Render the accuracy table; selection criterion: smallest CV MAE
models_C6H6 %>% 
    mutate(.model_id = seq_len(nrow(.)), .before = .model) %>% 
    rename(.model_desc = .model) %>% 
    relocate(c(MAE, MAPE, MASE, RMSE), .after = .model_desc) %>% 
    select(!c(ME, MPE, RMSSE,  ACF1)) %>% 
    table_modeltime_accuracy(theme = cosmo(), showSortIcon = FALSE, .searchable = FALSE)
print(paste("The best model for Benzene is",
            models_C6H6$.model[which.min(models_C6H6$MAE)]))
## [1] "The best model for Benzene is prophet"

We see that the residuals follow a normal distribution and a white-noise like waveform. Nevertheless, the ACF does still not resemble a white-noise ACF. As with CO, the forecast captures the tendency but not the correct oscillatory behaviour

# Refit the CV winner (prophet) on the full training partition
lam_C6H6   <- auto_lambda(train_ts$C6H6)
model_C6H6 <- train_ts %>%
    model(prophet(box_cox(C6H6, lam_C6H6) ~ season(period = 24)))

# Residual diagnostics.
# Fixed: this previously piped model_CO (a copy-paste slip from the CO
# section), so the plot showed Carbon Monoxide residuals labelled as
# Benzene; the residuals must come from model_C6H6.
model_C6H6 %>%
    residuals() %>%
    pretty_resid("C6H6", title = "Residuals for Benzene [μg/m³]")

# Forecast the test partition and overlay it on the observed values
f_C6H6 <- forecast(model_C6H6, test_ts)
train_ts %>%
    filter(year(Date) == 2005) %>% 
    autoplot(C6H6) +
        autolayer(f_C6H6, C6H6, color = colors["C6H6"]) +
        autolayer(test_ts, C6H6, color = "#838B8B", alpha = 0.75) +
        labs(x = "", y = "Benzene [μg/m³]",
             title = paste("MAE =", round(accuracy(f_C6H6, data_ts)$MAE, 4)))

Nitrogen Oxides

This may be the variable with more heteroskedasticity among the batch. Hence we Box-Cox transform in an attempt to at least mitigate some of the unevenness. In contrast with the previous gases, here the utility of the transformation is critical in getting a somewhat homoskedastic time series.

# Fixed typo in the displayed title: "atmosferic" -> "atmospheric"
train_ts %>% compare_box_cox("NOy", "Nitrogen Oxides atmospheric concentration [ppb]")

If we plot the ACF of the transformed data, we see justification for the seasonal differentiation.

# Box-Cox the series and inspect series/PACF up to 3 days (72 h) of lags
train_ts %>%
    mutate(NOy = box_cox_vec(NOy, silent = TRUE)) %>% 
    pretty_ts("NOy", "Box-Cox transformed Nitrogen Oxides", type = "partial", lag_max = 72)

# Apply a seasonal (lag-24) difference, then check whether an additional
# ordinary difference is needed — ndiffs = 0 below says it is not
train_ts %>% 
    mutate(BC_NOy  = box_cox_vec(NOy, silent = TRUE),
           dBC_NOy = difference(BC_NOy, 24)) %>% 
    features(dBC_NOy, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Difference and see if stationary
train_ts %>%
    mutate(NOy = difference(box_cox_vec(NOy, silent = TRUE), 24)) %>% 
    pretty_ts("NOy", "Diff_24 Box-Cox transformed Nitrogen Oxides", type = "partial", lag_max = 72)

Looking at the ACF and PACF we decide that an \(ARIMA(3,0,0)(3,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.

# Manual ARIMA orders read off the differenced ACF/PACF plots above:
# Look at PACF    => p = 3
# Look at PACF_24 => P = 3   (fixed comment typo: was "PACF_34")
summary_NOy <- list()
# 5-fold time-series CV: fit the five candidates per fold, forecast the
# fold's 1-month assessment window, store the accuracy table.
# Note the manual ARIMA here uses `0 +` (no constant), unlike CO/C6H6.
for (sp in enumerate(cvsplits$splits)) {
    # sp is list(ix = fold number, <rsample split object>)
    trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
    tstdat <- testing(sp[[2]])  %>% as_tsibble(index = Date)

    # Per-fold Box-Cox lambda
    lam <- auto_lambda(trndat$NOy)
    summary_NOy[[sp$ix]] <- trndat %>% 
        model(
            manual_arima = ARIMA(box_cox(NOy, lam) ~ 0 + pdq(3, 0, 0) +
                                     PDQ(3, 1, 0, period = 24), method = "CSS"),
            auto_arima   = ARIMA(box_cox(NOy, lam) ~ PDQ(period = 24)),
            prophet      = prophet(box_cox(NOy, lam) ~ season(period = 24)),
            auto_ets     = ETS(box_cox(NOy, lam) ~ season(period = 24)),
            auto_tslm    = TSLM(box_cox(NOy, lam) ~ trend() +
                                    fourier(period = "1 day",   K = 3) +
                                    fourier(period = "1 year",  K = 3))) %>% 
        forecast(tstdat) %>% 
        accuracy(data_ts)
}

# Average each accuracy metric over the five folds, per model
models_NOy <- bind_rows(summary_NOy) %>%
    select(!.type) %>%
    group_by(.model) %>%
    summarise_all(mean, na.rm = TRUE)

# Render the accuracy table; selection criterion: smallest CV MAE
models_NOy %>% 
    mutate(.model_id = seq_len(nrow(.)), .before = .model) %>% 
    rename(.model_desc = .model) %>% 
    relocate(c(MAE, MAPE, MASE, RMSE), .after = .model_desc) %>% 
    select(!c(ME, MPE, RMSSE,  ACF1)) %>% 
    table_modeltime_accuracy(theme = cosmo(), showSortIcon = FALSE, .searchable = FALSE)
print(paste("The best model for Nitrogen Oxides is",
            models_NOy$.model[which.min(models_NOy$MAE)]))
## [1] "The best model for Nitrogen Oxides is prophet"

We see that the residuals follow a normal distribution and a white-noise like waveform. Nevertheless, the ACF does still not resemble a white-noise ACF. On the performance in the test partition, the same conclusions extracted before apply here.

# Refit the CV winner (prophet) on the full training partition
lam_NOy   <- auto_lambda(train_ts$NOy)
model_NOy <- train_ts %>%
    model(prophet(box_cox(NOy, lam_NOy) ~ season(period = 24)))

# Residual diagnostics
model_NOy %>%
    residuals() %>%
    pretty_resid("NOy", title = "Residuals for Nitrogen Oxides [ppb]")

# Forecast the test partition and overlay it on the observed values
f_NOy <- forecast(model_NOy, test_ts)
train_ts %>%
    filter(year(Date) == 2005) %>% 
    autoplot(NOy) +
        autolayer(f_NOy, NOy, color = colors["NOy"]) +
        autolayer(test_ts, NOy, color = "#838B8B", alpha = 0.75) +
        labs(x = "", y = "Nitrogen Oxides [ppb]",
             title = paste("MAE =", round(accuracy(f_NOy, data_ts)$MAE, 4)))

Nitrogen Dioxide

Different to the rest of the polluting agents, this one does not seem to require of a transformation. However, this transformation will ensure the positivity of all predicted values in the future, which is desirable. Hence we apply it here also.

# Fixed typo in the displayed title: "atmosferic" -> "atmospheric"
train_ts %>% compare_box_cox("NO2", "Nitrogen Dioxide atmospheric concentration [μg/m³]")

Looking at the ACF in the previous section, we note that there is a strong daily seasonality. Hence we differentiate with 24 lags.

If we plot the ACF of the transformed data, we see justification for the seasonal differentiation.

# Box-Cox the series and inspect series/PACF up to 3 days (72 h) of lags.
# Fixed: the plot title said "Nitrogen Oxides" (copy-pasted from the NOy
# section); this chunk analyses Nitrogen Dioxide.
train_ts %>%
    mutate(NO2 = box_cox_vec(NO2, silent = TRUE)) %>% 
    pretty_ts("NO2", "Box-Cox transformed Nitrogen Dioxide", type = "partial", lag_max = 72)

# Apply a seasonal (lag-24) difference, then check whether an additional
# ordinary difference is needed — ndiffs = 0 below says it is not
train_ts %>% 
    mutate(BC_NO2  = box_cox_vec(NO2, silent = TRUE),
           dBC_NO2 = difference(BC_NO2, 24)) %>% 
    features(dBC_NO2, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Difference and see if stationary
train_ts %>%
    mutate(NO2 = difference(box_cox_vec(NO2, silent = TRUE), 24)) %>% 
    pretty_ts("NO2", "Diff_24 Box-Cox transformed Nitrogen Dioxide",
              type = "partial", lag_max = 72)

Looking at the ACF and PACF we decide that an \(ARIMA(2,0,0)(0,1,3)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.

# Manual ARIMA orders read off the differenced ACF/PACF plots above:
# Look at PACF    => p = 2   (fixed: comment said 1, but the fitted model
#                             and the text both use pdq(2, 0, 0))
# Look at ACF_24  => Q = 3
summary_NO2 <- list()
# 5-fold time-series CV: fit the five candidates per fold, forecast the
# fold's 1-month assessment window, store the accuracy table
for (sp in enumerate(cvsplits$splits)) {
    # sp is list(ix = fold number, <rsample split object>)
    trndat <- training(sp[[2]]) %>% as_tsibble(index = Date)
    tstdat <- testing(sp[[2]])  %>% as_tsibble(index = Date)

    # Per-fold Box-Cox lambda
    lam <- auto_lambda(trndat$NO2)
    summary_NO2[[sp$ix]] <- trndat %>% 
        model(
            manual_arima = ARIMA(box_cox(NO2, lam) ~ 1 + pdq(2, 0, 0) +
                                     PDQ(0, 1, 3, period = 24), method = "CSS"),
            auto_arima   = ARIMA(box_cox(NO2, lam) ~ PDQ(period = 24)),
            prophet      = prophet(box_cox(NO2, lam) ~ season(period = 24)),
            auto_ets     = ETS(box_cox(NO2, lam) ~ season(period = 24)),
            auto_tslm    = TSLM(box_cox(NO2, lam) ~ trend() + 
                                    fourier(period = "1 day",   K = 3) +
                                    fourier(period = "1 year",  K = 3))) %>% 
        forecast(tstdat) %>% 
        accuracy(data_ts)
}

# Average each accuracy metric over the CV splits, one row per model.
# `summarise_all()` is superseded and passing `na.rm` through `...` is
# deprecated in dplyr, so use `across()` with an explicit anonymous function.
models_NO2 <- bind_rows(summary_NO2) %>%
    select(!.type) %>%
    group_by(.model) %>%
    summarise(across(everything(), \(x) mean(x, na.rm = TRUE)))


# Format the averaged metrics with modeltime's table helper (it expects
# .model_id/.model_desc columns, hence the rename/relocate shuffle).
models_NO2 %>% 
    mutate(.model_id = seq_len(nrow(.)), .before = .model) %>% 
    rename(.model_desc = .model) %>% 
    relocate(c(MAE, MAPE, MASE, RMSE), .after = .model_desc) %>% 
    select(!c(ME, MPE, RMSSE,  ACF1)) %>% 
    table_modeltime_accuracy(theme = cosmo(), showSortIcon = FALSE, .searchable = FALSE)
# Report the winner by mean absolute error ("Dioxide" singular: NO2 is one gas)
print(paste("The best model for Nitrogen Dioxide is",
            models_NO2$.model[which.min(models_NO2$MAE)]))
## [1] "The best model for Nitrogen Dioxides is manual_arima"

We see that the residuals follow a normal distribution and a white-noise like waveform. In contrast to previous series, here the ACF does resemble much closer the one expected from a white-noise signal. When comparing with the test partition, we see very reasonable confidence intervals which result in a very acceptable prediction of the trend and the possible range of values.

# Refit the winning manual ARIMA on the full train partition.
# BUG FIX: the Box-Cox lambda must be the one computed here (`lam_NO2`);
# the original used `lam`, a stale variable left over from the last CV
# iteration, while `lam_NO2` was computed but never used.
lam_NO2   <- auto_lambda(train_ts$NO2)
model_NO2 <- train_ts %>%
    model(ARIMA(box_cox(NO2, lam_NO2) ~ 1 + pdq(2, 0, 0) +
                    PDQ(0, 1, 3, period = 24), method = "CSS"))

# Residual diagnostics: distribution, waveform and ACF
model_NO2 %>%
    residuals() %>%
    pretty_resid("NO2", title = "Residuals for Nitrogen Dioxide [μg/m³]")

# Forecast over the test horizon and overlay the prediction (green), the
# held-out data (grey) and the 2005 portion of the train data; MAE in title.
f_NO2 <- forecast(model_NO2, test_ts)
train_ts %>%
    filter(year(Date) == 2005) %>% 
    autoplot(NO2) +
        autolayer(f_NO2, NO2, color = colors["NO2"]) +
        autolayer(test_ts, NO2, color = "#838B8B", alpha = 0.75) +
        labs(x = "", y = "Nitrogen Dioxide [μg/m³]",
             title = paste("MAE =", round(accuracy(f_NO2, data_ts)$MAE, 4)))

We observe that for all the polluting agents the best model overall was Prophet except for NO2 where our manual ARIMA performed best. We will use them in the following section to forecast the concentration of the various gases in order to test their effect on the temperature. For now, let us move on to temperature and how a univariate and multivariate approach on its forecast may behave.

Temperature

Now we are tasked to find a good model to describe the temperature as a function of the different polluting agents. We will attempt this by first finding a suitable model for the temperature just as we did for the gases. Then, we will play with the number of predictors to include in the model. At last, we will consider the 6-month forecast as we did for pollutants.

Univariate model

Let us start by finding the model. We do not believe that a Box-Cox transformation is needed given the apparent homoskedasticity of the data. Now we examine the time series to create an appropriate ARIMA model. For this purpose, we are going to apply a 24-hour differentiation. Then, we check and see using unitroot tests that a normal difference is not needed.

# Let's differentiate seasonably. Then check if more differences are needed
# (unitroot_ndiffs on the lag-24 differenced series; 0 means no extra
# regular difference is required, as the output below confirms)
train_ts %>% 
    mutate(dTemp = difference(Temp, 24)) %>% 
    features(dTemp, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      0
# Difference and see if stationary
# Lag-24 seasonal difference of the temperature with ACF/PACF over 72 lags
train_ts %>% 
    mutate(Temp = difference(Temp, 24)) %>% 
    pretty_ts("Temp", "Diff_24 Temperature", type = "partial", lag_max = 72)

Looking at the ACF and PACF we decide that an \(ARIMA(2,0,0)(2,1,0)[24]\) is the optimal choice. Having that in mind, we proceed to test this model along with the rest of the considered: auto ARIMA, auto ETS, Prophet and TSLM.

# The PACF suggests p = 2 and the seasonal PACF at lag 24 suggests P = 2,
# i.e. the ARIMA(2,0,0)(2,1,0)[24] discussed in the text. Cross-validate it
# against the automatic alternatives on every split.
summary_Temp <- vector("list", length(cvsplits$splits))
for (ix in seq_along(cvsplits$splits)) {
    split_k <- cvsplits$splits[[ix]]
    trndat  <- training(split_k) %>% as_tsibble(index = Date)
    tstdat  <- testing(split_k)  %>% as_tsibble(index = Date)

    summary_Temp[[ix]] <- trndat %>%
        model(
            manual_arima = ARIMA(Temp ~ 1 + pdq(2, 0, 0) +
                                     PDQ(2, 1, 0, period = 24), method = "CSS"),
            auto_arima   = ARIMA(Temp ~ PDQ(period = 24)),
            prophet      = prophet(Temp ~ season(period = 24)),
            auto_ets     = ETS(Temp ~ season(period = 24)),
            auto_tslm    = TSLM(Temp ~ trend() +
                                    fourier(period = "1 day",   K = 3) +
                                    fourier(period = "1 year",  K = 3))) %>%
        forecast(tstdat) %>%
        accuracy(data_ts)
}

# Average each accuracy metric over the CV splits, one row per model.
# `summarise_all()` is superseded and passing `na.rm` through `...` is
# deprecated in dplyr, so use `across()` with an explicit anonymous function.
models_Temp <- bind_rows(summary_Temp) %>%
    select(!.type) %>%
    group_by(.model) %>%
    summarise(across(everything(), \(x) mean(x, na.rm = TRUE)))

# Format the averaged metrics with modeltime's table helper (it expects
# .model_id/.model_desc columns, hence the rename/relocate shuffle).
models_Temp %>% 
    mutate(.model_id = seq_len(nrow(.)), .before = .model) %>% 
    rename(.model_desc = .model) %>% 
    relocate(c(MAE, MAPE, MASE, RMSE), .after = .model_desc) %>% 
    select(!c(ME, MPE, RMSSE,  ACF1)) %>% 
    table_modeltime_accuracy(theme = cosmo(), showSortIcon = FALSE, .searchable = FALSE)
# Report the winner by mean absolute error
print(paste("The best model for Temperature is",
            models_Temp$.model[which.min(models_Temp$MAE)]))
## [1] "The best model for Temperature is prophet"

Let us proceed as usual and look at the residuals and the performance on the test partition. The residuals exhibit the same behaviour as we noted with the polluting agents: a zero-centred normal distribution but very high correlations in the ACF. On the test partition, we note a very poor prediction which, although not bad for the first month, quickly starts falling below the actual recorded data. This is a sign that the model is not able to capture the whole yearly seasonality, which is not too rare considering that we only have one full year of data, hence even less in the train partition.

# Refit the CV winner (Prophet with daily seasonality) on the train partition
model_Temp <- train_ts %>%
    model(prophet(Temp ~ season(period = 24)))

# Residual diagnostics: distribution, waveform and ACF
model_Temp %>%
    residuals() %>%
    pretty_resid("Temp", title = "Residuals for Temperature")

# Forecast over the test horizon and overlay the prediction against the
# held-out data (grey) and the 2005 portion of the train data; MAE in title.
f_Temp <- forecast(model_Temp, test_ts)
train_ts %>%
    filter(year(Date) == 2005) %>% 
    autoplot(Temp) +
        autolayer(f_Temp, Temp, color = colors["Temp"]) +
        autolayer(test_ts, Temp, color = "#838B8B", alpha = 0.75) +
        labs(x = "", y = "Temperature [°C]",
             title = paste("MAE =", round(accuracy(f_Temp, data_ts)$MAE, 4)))

Without further ado, let us move directly to the last point of this whole section, the multivariate forecast on the temperature.

Multivariate model

We are now going to consider adding the gases to the model of the temperature. For this, we will compare two models which work very well and with good stability when adding predictors 1: TSLM and VAR. Note that the VAR model is able to predict not only for the temperature but also for all the agents at once. Nonetheless, we are interested in the temperature as we expect the presence of one concrete agent not to have a big impact on another 2. Hence we will only compare the performance for the temperature.

Let us start by determining the TSLM to compare, engaging in a manual iterative process that removes predictors which do not appear to be related to the temperature. As a first approximation we consider all the polluting agents, both at the same moment and with a one-day lagged effect.

# All
# Baseline TSLM: trend, daily + yearly Fourier terms, and every pollutant
# both contemporaneously and lagged 24 h. Predictors are pruned one step at
# a time in the chunks below based on the p-values reported here.
predict_model <- train_ts %>% 
    model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 3) + 
                       CO + C6H6 + NOy + NO2 +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NOy, 24) + lag(NO2, 24)))
predict_model %>% report()
## Series: TempM 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -13.64711  -2.05359   0.04705   2.23013  10.75562 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              26.8756321  0.6973758  38.538  < 2e-16
## trend()                                  -0.0024994  0.0001828 -13.676  < 2e-16
## fourier(period = "1 day", K = 3)C1_24    -2.6329808  0.0582719 -45.184  < 2e-16
## fourier(period = "1 day", K = 3)S1_24    -2.9795825  0.0639462 -46.595  < 2e-16
## fourier(period = "1 day", K = 3)C2_24     0.9955196  0.0551797  18.041  < 2e-16
## fourier(period = "1 day", K = 3)S2_24     1.1290254  0.0586948  19.236  < 2e-16
## fourier(period = "1 day", K = 3)C3_24    -0.2659150  0.0543833  -4.890 1.03e-06
## fourier(period = "1 day", K = 3)S3_24     0.1518243  0.0548820   2.766  0.00568
## fourier(period = "1 year", K = 3)C1_8766 -2.7659964  0.4318835  -6.404 1.60e-10
## fourier(period = "1 year", K = 3)S1_8766 -8.0951848  0.2413665 -33.539  < 2e-16
## fourier(period = "1 year", K = 3)C2_8766  2.9389645  0.1745945  16.833  < 2e-16
## fourier(period = "1 year", K = 3)S2_8766  2.0376329  0.1327419  15.350  < 2e-16
## fourier(period = "1 year", K = 3)C3_8766 -0.0753438  0.0706437  -1.067  0.28622
## fourier(period = "1 year", K = 3)S3_8766  0.8767532  0.1068235   8.207 2.65e-16
## CO                                        0.1127158  0.0631113   1.786  0.07414
## C6H6                                      0.1817264  0.0102734  17.689  < 2e-16
## NOy                                      -0.0080657  0.0005180 -15.570  < 2e-16
## NO2                                      -0.0022842  0.0018561  -1.231  0.21851
## lag(CO, 24)                               0.0647771  0.0633347   1.023  0.30645
## lag(C6H6, 24)                             0.0434672  0.0103035   4.219 2.49e-05
## lag(NOy, 24)                             -0.0004133  0.0005177  -0.798  0.42469
## lag(NO2, 24)                             -0.0027536  0.0018548  -1.485  0.13771
##                                             
## (Intercept)                              ***
## trend()                                  ***
## fourier(period = "1 day", K = 3)C1_24    ***
## fourier(period = "1 day", K = 3)S1_24    ***
## fourier(period = "1 day", K = 3)C2_24    ***
## fourier(period = "1 day", K = 3)S2_24    ***
## fourier(period = "1 day", K = 3)C3_24    ***
## fourier(period = "1 day", K = 3)S3_24    ** 
## fourier(period = "1 year", K = 3)C1_8766 ***
## fourier(period = "1 year", K = 3)S1_8766 ***
## fourier(period = "1 year", K = 3)C2_8766 ***
## fourier(period = "1 year", K = 3)S2_8766 ***
## fourier(period = "1 year", K = 3)C3_8766    
## fourier(period = "1 year", K = 3)S3_8766 ***
## CO                                       .  
## C6H6                                     ***
## NOy                                      ***
## NO2                                         
## lag(CO, 24)                                 
## lag(C6H6, 24)                            ***
## lag(NOy, 24)                                
## lag(NO2, 24)                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.253 on 7298 degrees of freedom
## Multiple R-squared: 0.8595,  Adjusted R-squared: 0.8591
## F-statistic:  2126 on 21 and 7298 DF, p-value: < 2.22e-16
# NO2 out
# Contemporaneous NO2 removed (p = 0.219 in the previous fit); its 24 h lag
# is kept for now and reassessed in the next step.
predict_model <- train_ts %>% 
    model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 3) + 
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NOy, 24) + lag(NO2, 24)))
predict_model %>% report()
## Series: TempM 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -13.68079  -2.04863   0.05124   2.22714  10.76615 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              26.8067051  0.6951474  38.563  < 2e-16
## trend()                                  -0.0025038  0.0001827 -13.702  < 2e-16
## fourier(period = "1 day", K = 3)C1_24    -2.6250804  0.0579192 -45.323  < 2e-16
## fourier(period = "1 day", K = 3)S1_24    -2.9650360  0.0628464 -47.179  < 2e-16
## fourier(period = "1 day", K = 3)C2_24     0.9919768  0.0551065  18.001  < 2e-16
## fourier(period = "1 day", K = 3)S2_24     1.1350929  0.0584894  19.407  < 2e-16
## fourier(period = "1 day", K = 3)C3_24    -0.2668438  0.0543799  -4.907 9.45e-07
## fourier(period = "1 day", K = 3)S3_24     0.1493630  0.0548475   2.723  0.00648
## fourier(period = "1 year", K = 3)C1_8766 -2.7653466  0.4318984  -6.403 1.62e-10
## fourier(period = "1 year", K = 3)S1_8766 -8.1140507  0.2408876 -33.684  < 2e-16
## fourier(period = "1 year", K = 3)C2_8766  2.9311626  0.1744855  16.799  < 2e-16
## fourier(period = "1 year", K = 3)S2_8766  2.0283642  0.1325328  15.305  < 2e-16
## fourier(period = "1 year", K = 3)C3_8766 -0.0714687  0.0705759  -1.013  0.31126
## fourier(period = "1 year", K = 3)S3_8766  0.8754202  0.1068218   8.195 2.93e-16
## CO                                        0.0891488  0.0601377   1.482  0.13827
## C6H6                                      0.1802612  0.0102045  17.665  < 2e-16
## NOy                                      -0.0082212  0.0005024 -16.364  < 2e-16
## lag(CO, 24)                               0.0748304  0.0628078   1.191  0.23353
## lag(C6H6, 24)                             0.0447335  0.0102523   4.363 1.30e-05
## lag(NOy, 24)                             -0.0003454  0.0005148  -0.671  0.50234
## lag(NO2, 24)                             -0.0037966  0.0016499  -2.301  0.02141
##                                             
## (Intercept)                              ***
## trend()                                  ***
## fourier(period = "1 day", K = 3)C1_24    ***
## fourier(period = "1 day", K = 3)S1_24    ***
## fourier(period = "1 day", K = 3)C2_24    ***
## fourier(period = "1 day", K = 3)S2_24    ***
## fourier(period = "1 day", K = 3)C3_24    ***
## fourier(period = "1 day", K = 3)S3_24    ** 
## fourier(period = "1 year", K = 3)C1_8766 ***
## fourier(period = "1 year", K = 3)S1_8766 ***
## fourier(period = "1 year", K = 3)C2_8766 ***
## fourier(period = "1 year", K = 3)S2_8766 ***
## fourier(period = "1 year", K = 3)C3_8766    
## fourier(period = "1 year", K = 3)S3_8766 ***
## CO                                          
## C6H6                                     ***
## NOy                                      ***
## lag(CO, 24)                                 
## lag(C6H6, 24)                            ***
## lag(NOy, 24)                                
## lag(NO2, 24)                             *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.253 on 7299 degrees of freedom
## Multiple R-squared: 0.8595,  Adjusted R-squared: 0.8591
## F-statistic:  2232 on 20 and 7299 DF, p-value: < 2.22e-16
# lag NOy out
# lag(NOy, 24) removed (p = 0.502 in the previous fit); contemporaneous NOy
# remains strongly significant and stays in the model.
predict_model <- train_ts %>% 
    model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 3) + 
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24)))
predict_model %>% report()
## Series: TempM 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -13.67092  -2.03969   0.04897   2.22801  10.78007 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              26.8431154  0.6929993  38.735  < 2e-16
## trend()                                  -0.0025079  0.0001826 -13.733  < 2e-16
## fourier(period = "1 day", K = 3)C1_24    -2.6252054  0.0579167 -45.327  < 2e-16
## fourier(period = "1 day", K = 3)S1_24    -2.9722014  0.0619297 -47.993  < 2e-16
## fourier(period = "1 day", K = 3)C2_24     0.9906763  0.0550703  17.989  < 2e-16
## fourier(period = "1 day", K = 3)S2_24     1.1334413  0.0584353  19.397  < 2e-16
## fourier(period = "1 day", K = 3)C3_24    -0.2662715  0.0543712  -4.897 9.93e-07
## fourier(period = "1 day", K = 3)S3_24     0.1503065  0.0548274   2.741  0.00613
## fourier(period = "1 year", K = 3)C1_8766 -2.7747835  0.4316530  -6.428 1.37e-10
## fourier(period = "1 year", K = 3)S1_8766 -8.1077536  0.2406956 -33.685  < 2e-16
## fourier(period = "1 year", K = 3)C2_8766  2.9341080  0.1744237  16.822  < 2e-16
## fourier(period = "1 year", K = 3)S2_8766  2.0342100  0.1322410  15.383  < 2e-16
## fourier(period = "1 year", K = 3)C3_8766 -0.0719263  0.0705700  -1.019  0.30813
## fourier(period = "1 year", K = 3)S3_8766  0.8762058  0.1068113   8.203 2.74e-16
## CO                                        0.0973653  0.0588750   1.654  0.09822
## C6H6                                      0.1811190  0.0101237  17.891  < 2e-16
## NOy                                      -0.0083555  0.0004608 -18.133  < 2e-16
## lag(CO, 24)                               0.0564447  0.0565108   0.999  0.31791
## lag(C6H6, 24)                             0.0428147  0.0098449   4.349 1.39e-05
## lag(NO2, 24)                             -0.0040458  0.0016074  -2.517  0.01186
##                                             
## (Intercept)                              ***
## trend()                                  ***
## fourier(period = "1 day", K = 3)C1_24    ***
## fourier(period = "1 day", K = 3)S1_24    ***
## fourier(period = "1 day", K = 3)C2_24    ***
## fourier(period = "1 day", K = 3)S2_24    ***
## fourier(period = "1 day", K = 3)C3_24    ***
## fourier(period = "1 day", K = 3)S3_24    ** 
## fourier(period = "1 year", K = 3)C1_8766 ***
## fourier(period = "1 year", K = 3)S1_8766 ***
## fourier(period = "1 year", K = 3)C2_8766 ***
## fourier(period = "1 year", K = 3)S2_8766 ***
## fourier(period = "1 year", K = 3)C3_8766    
## fourier(period = "1 year", K = 3)S3_8766 ***
## CO                                       .  
## C6H6                                     ***
## NOy                                      ***
## lag(CO, 24)                                 
## lag(C6H6, 24)                            ***
## lag(NO2, 24)                             *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.253 on 7300 degrees of freedom
## Multiple R-squared: 0.8595,  Adjusted R-squared: 0.8591
## F-statistic:  2350 on 19 and 7300 DF, p-value: < 2.22e-16
# fourier K = 2
# Yearly Fourier order reduced to K = 2 after the K = 3 cosine term proved
# insignificant in the previous fits. This is the final candidate TSLM.
predict_model <- train_ts %>% 
    model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 2) + 
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24)))
predict_model %>% report()
## Series: TempM 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -13.94545  -2.06177   0.05922   2.22549  11.16144 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                               2.202e+01  3.655e-01  60.262  < 2e-16
## trend()                                  -1.222e-03  9.405e-05 -12.993  < 2e-16
## fourier(period = "1 day", K = 3)C1_24    -2.614e+00  5.814e-02 -44.968  < 2e-16
## fourier(period = "1 day", K = 3)S1_24    -2.949e+00  6.211e-02 -47.475  < 2e-16
## fourier(period = "1 day", K = 3)C2_24     9.943e-01  5.531e-02  17.975  < 2e-16
## fourier(period = "1 day", K = 3)S2_24     1.151e+00  5.864e-02  19.621  < 2e-16
## fourier(period = "1 day", K = 3)C3_24    -2.673e-01  5.461e-02  -4.894 1.01e-06
## fourier(period = "1 day", K = 3)S3_24     1.404e-01  5.506e-02   2.551   0.0108
## fourier(period = "1 year", K = 2)C1_8766 -5.790e+00  2.244e-01 -25.803  < 2e-16
## fourier(period = "1 year", K = 2)S1_8766 -6.492e+00  1.343e-01 -48.356  < 2e-16
## fourier(period = "1 year", K = 2)C2_8766  1.750e+00  9.685e-02  18.070  < 2e-16
## fourier(period = "1 year", K = 2)S2_8766  1.258e+00  8.394e-02  14.992  < 2e-16
## CO                                        1.193e-01  5.907e-02   2.019   0.0435
## C6H6                                      1.798e-01  1.017e-02  17.688  < 2e-16
## NOy                                      -8.341e-03  4.628e-04 -18.021  < 2e-16
## lag(CO, 24)                               8.978e-02  5.656e-02   1.587   0.1124
## lag(C6H6, 24)                             4.022e-02  9.882e-03   4.070 4.76e-05
## lag(NO2, 24)                             -3.950e-03  1.610e-03  -2.453   0.0142
##                                             
## (Intercept)                              ***
## trend()                                  ***
## fourier(period = "1 day", K = 3)C1_24    ***
## fourier(period = "1 day", K = 3)S1_24    ***
## fourier(period = "1 day", K = 3)C2_24    ***
## fourier(period = "1 day", K = 3)S2_24    ***
## fourier(period = "1 day", K = 3)C3_24    ***
## fourier(period = "1 day", K = 3)S3_24    *  
## fourier(period = "1 year", K = 2)C1_8766 ***
## fourier(period = "1 year", K = 2)S1_8766 ***
## fourier(period = "1 year", K = 2)C2_8766 ***
## fourier(period = "1 year", K = 2)S2_8766 ***
## CO                                       *  
## C6H6                                     ***
## NOy                                      ***
## lag(CO, 24)                                 
## lag(C6H6, 24)                            ***
## lag(NO2, 24)                             *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.267 on 7302 degrees of freedom
## Multiple R-squared: 0.8582,  Adjusted R-squared: 0.8578
## F-statistic:  2599 on 17 and 7302 DF, p-value: < 2.22e-16

We end up with a model that includes the concentrations of Carbon Monoxide, Benzene and Nitrogen Oxides 3, together with the one-day lagged concentrations of Carbon Monoxide, Benzene and Nitrogen Dioxide.

Now we use cross-validation as before to compare this model with the one that VAR may predict for us.

# Compare the hand-built TSLM against a VAR (daily and yearly AR terms) on
# the same CV splits. The VAR forecasts every series jointly, so only its
# TempM response is kept to make the comparison fair.
summary_TempM <- vector("list", length(cvsplits$splits))
for (ix in seq_along(cvsplits$splits)) {
    split_k <- cvsplits$splits[[ix]]
    trndat  <- training(split_k) %>% as_tsibble(index = Date)
    tstdat  <- testing(split_k)  %>% as_tsibble(index = Date)

    var_acc <- trndat %>%
        model(VAR(vars(TempM, CO, C6H6, NOy, NO2) ~ AR(24) + AR(8766))) %>%
        forecast(tstdat) %>%
        accuracy(data_ts) %>%
        filter(.response == "TempM")

    tslm_acc <- trndat %>%
        model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 2) +
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24))) %>%
        forecast(tstdat) %>%
        accuracy(data_ts)

    # Overwrite the auto-generated model labels with readable ones
    acc        <- bind_rows(var_acc, tslm_acc)
    acc$.model <- c("VAR", "TSLM")
    summary_TempM[[ix]] <- acc
}

# Average each accuracy metric over the CV splits, one row per model.
# `summarise_all()` is superseded and passing `na.rm` through `...` is
# deprecated in dplyr, so use `across()` with an explicit anonymous function.
models_TempM <- bind_rows(summary_TempM) %>%
    select(!c(.type, .response)) %>%
    group_by(.model) %>%
    summarise(across(everything(), \(x) mean(x, na.rm = TRUE)))

# Format the averaged metrics with modeltime's table helper (it expects
# .model_id/.model_desc columns, hence the rename/relocate shuffle).
models_TempM %>% 
    mutate(.model_id = seq_len(nrow(.)), .before = .model) %>% 
    rename(.model_desc = .model) %>% 
    relocate(c(MAE, MAPE, MASE, RMSE), .after = .model_desc) %>% 
    select(!c(ME, MPE, RMSSE,  ACF1)) %>% 
    table_modeltime_accuracy(theme = cosmo(), showSortIcon = FALSE, .searchable = FALSE)
# Report the winner by mean absolute error
print(paste("The best model for Temperature (multi) is",
            models_TempM$.model[which.min(models_TempM$MAE)]))
## [1] "The best model for Temperature (multi) is TSLM"

Now that we have checked that the best model for the temperature is the considered TSLM, we follow by looking at the residuals of the model, which again show the exact same behaviour we observed for all previous series. The performance on the test partition is quite similar to the one we observed when no predictors were considered. However, this time there is a slight bend upwards which indicates that the model has captured the tendency better. We may also note this in the slight decrease in the MAE. This gives us hope for the future forecasts.

# Refit the selected multivariate TSLM on the full train partition
model_TempM <- train_ts %>%
    model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 2) + 
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24)))

# Residual diagnostics: distribution, waveform and ACF
model_TempM %>%
    residuals() %>%
    pretty_resid("TempM", title = "Residuals for Temperature (multi)")

# Forecast over the test horizon (test_ts supplies the future regressor
# values) and overlay against the held-out data; MAE in the title.
f_TempM <- forecast(model_TempM, test_ts)
train_ts %>%
    filter(year(Date) == 2005) %>% 
    autoplot(TempM) +
        autolayer(f_TempM, TempM, color = colors["TempM"]) +
        autolayer(test_ts, TempM, color = "#838B8B", alpha = 0.75) +
        labs(x = "", y = "Temperature (multi) [°C]",
             title = paste("MAE =", round(accuracy(f_TempM, data_ts)$MAE, 4)))

Six month forecasts and conclusions

Finally we end all the work done in this section by using the selected models in a six-month forecast of both the polluting agents and the two models for the temperature. We note that all of the predictions look reasonable in general terms, especially in the first few months of the forecast. Later on, the confidence intervals either explode, giving us little to no information, or the forecasts reach unphysical results.

The latter observation is more prominent for the univariate temperature model. On the other hand, the multivariate model captures the yearly seasonality and provides reasonable and stable prediction intervals. Hence we claim that by accounting for the possible effects of the different polluting agents, we end up with a very reasonable model for future temperature forecasts.

# Box-Cox lambdas, now estimated on the full dataset
lam_CO   <- auto_lambda(data_ts$CO)
lam_C6H6 <- auto_lambda(data_ts$C6H6)
lam_NOy  <- auto_lambda(data_ts$NOy)
lam_NO2  <- auto_lambda(data_ts$NO2)

# Forecast of univariate models (the winners of each per-series comparison:
# Prophet for CO, C6H6, NOy and Temp; the manual ARIMA for NO2)
forecast_CO   <- data_ts %>% 
    model(prophet(box_cox(CO, lam_CO) ~ season(period = 24))) %>% 
    forecast(h = "6 months")
forecast_C6H6 <- data_ts %>% 
    model(prophet(box_cox(C6H6, lam_C6H6) ~ season(period = 24))) %>% 
    forecast(h = "6 months")
forecast_NOy  <- data_ts %>% 
    model(prophet(box_cox(NOy, lam_NOy) ~ season(period = 24))) %>% 
    forecast(h = "6 months")
forecast_NO2  <- data_ts %>% 
    model(ARIMA(box_cox(NO2, lam_NO2) ~ 1 + pdq(2, 0, 0) +
                    PDQ(0, 1, 3, period = 24), method = "CSS")) %>% 
    forecast(h = "6 months")
forecast_Temp <- data_ts %>% 
    model(prophet(Temp ~ season(period = 24))) %>% 
    forecast(h = "6 months")

# Forecast of multivariate model: the univariate point forecasts of the
# gases serve as future values of the exogenous regressors.
future_data <- new_data(data_ts, n = nrow(forecast_Temp)) %>%
    mutate(CO  = forecast_CO$.mean,  C6H6 = forecast_C6H6$.mean,
           NOy = forecast_NOy$.mean, NO2  = forecast_NO2$.mean)
# Daily Fourier period written as "1 day" for consistency with the model
# selected during cross-validation (equivalent to 24 on this hourly data).
forecast_TempM <- data_ts %>% 
    model(TSLM(TempM ~ trend() + fourier(period = "1 day", K = 3) +
                       fourier(period = "1 year", K = 2) + 
                       CO + C6H6 + NOy +
                       lag(CO, 24) + lag(C6H6, 24) + lag(NO2, 24))) %>% 
    forecast(future_data)

# Plot all of them
# One panel per series: full history plus the six-month forecast drawn in
# the series' own colour. NOy's y-axis is clipped at 1500 so the forecast
# remains readable next to the historical spikes.
p1 <- data_ts %>% 
    autoplot(CO) +
    autolayer(forecast_CO,    CO,    color = colors["CO"]) +
    labs(x = "", y = "Carbon Monoxide [mg/m³]")
p2 <- data_ts %>% 
    autoplot(C6H6) +
    autolayer(forecast_C6H6,  C6H6,  color = colors["C6H6"]) +
    labs(x = "", y = "Benzene [μg/m³]")
p3 <- data_ts %>% 
    autoplot(NOy) +
    autolayer(forecast_NOy,   NOy,   color = colors["NOy"]) +
    labs(x = "", y = "Nitrogen Oxides [ppb]") +
    coord_cartesian(ylim = c(NA, 1500))
p4 <- data_ts %>% 
    autoplot(NO2) +
    autolayer(forecast_NO2,   NO2,   color = colors["NO2"]) +
    labs(x = "", y = "Nitrogen Dioxide [μg/m³]")
p5 <- data_ts %>% 
    autoplot(Temp) +
    autolayer(forecast_Temp,  Temp,  color = colors["Temp"]) +
    labs(x = "", y = "Temperature [°C]")
p6 <- data_ts %>% 
    autoplot(TempM) +
    autolayer(forecast_TempM, TempM, color = colors["TempM"]) +
    labs(x = "", y = "Temperature (multi) [°C]")
# Render in pairs with patchwork's `/` operator (stacked vertically)
p1 / p2

p3 / p4

p5 / p6

Machine Learning

Now we come to the advanced methods, where we will make use of Machine Learning (ML) techniques in order to forecast the temperature. We will not be agnostic to the previous findings and, as such, we will make use of all the acquired knowledge for this section. This means that we will consider the relations between the temperature and the rest of the polluting agents.

Let us thus outline the structure of this section. First, we will try different methods for the description of the temperature with dependence on the concentration of the gases in the atmosphere. We will make the same forecast as we did with the simple methods and compare whether ML brings forth improvements or not on the forecasting of the temperature.

We are conscious of the necessity of hyperparameter tuning. Nonetheless, we are constrained here by computational performance, which this process demands heavily. As we will see later, sticking with the defaults does not turn out to be a punishing decision.

Following the usual workflow provided by the tidymodels API, we build a recipe in which we will consider daily, monthly and yearly seasonality. We will consider a total of seven models: Elastic Net, Random Forest, XGBoost, NNetar (neural network), Prophet (boost and regression) and boost ARIMA.

# Load here to avoid errors with previous parts
# (same install-if-missing idiom as the file header: install only when the
# initial require() fails, then attempt to load again)
require(tidymodels)         || i.p("tidymodels")         && require(tidymodels)
require(modeltime.resample) || i.p("modeltime.resample") && require(modeltime.resample)
require(modeltime.ensemble) || i.p("modeltime.ensemble") && require(modeltime.ensemble)

# Recipe: model TempM from the timestamp and the four gas concentrations,
# adding calendar features (day-of-year, month, year), dummy-encoding any
# nominal columns, and K = 3 Fourier pairs for the daily cycle.
recipe_spec <- recipe(TempM ~ Date + CO + C6H6 + NOy + NO2,
                      training(traintest)) %>%
    step_date(Date, features = c("doy", "month", "year")) %>%
    step_dummy(all_nominal()) %>%
    step_fourier(Date, K = 3, period = 24)

# Train models in train partition.
# Helper: wrap a parsnip model spec and a recipe into a workflow and fit it
# on the train partition — removes the sevenfold copy-paste of the chain.
fit_on_train <- function(spec, rec = recipe_spec) {
    workflow() %>%
        add_model(spec) %>%
        add_recipe(rec) %>%
        fit(training(traintest))
}

models_ml <- modeltime_table(
    # Elastic Net (raw Date column removed: engine is not date-aware)
    fit_on_train(linear_reg(penalty = 0.01, mixture = 0.5) %>%
                     set_engine("glmnet"),
                 recipe_spec %>% step_rm(Date)),
    # Random Forest (Date removed, as above)
    fit_on_train(rand_forest(mode = "regression") %>%
                     set_engine("randomForest"),
                 recipe_spec %>% step_rm(Date)),
    # XGBoost (Date removed, as above)
    fit_on_train(boost_tree(mode = "regression") %>%
                     set_engine("xgboost"),
                 recipe_spec %>% step_rm(Date)),
    # Neural network autoregression (date-aware: keeps the Date column)
    fit_on_train(nnetar_reg(seasonal_period = 24) %>%
                     set_engine("nnetar")),
    # Prophet with XGBoost modelling the residuals
    fit_on_train(prophet_boost(seasonality_daily = TRUE) %>%
                     set_engine("prophet_xgboost")),
    # ARIMA with XGBoost modelling the residuals
    fit_on_train(arima_boost(seasonal_period = 24) %>%
                     set_engine("arima_xgboost")),
    # Plain Prophet regression
    fit_on_train(prophet_reg(seasonality_daily = TRUE) %>%
                     set_engine("prophet")))

We will perform time series cross validation on each of the models. In the same fashion as we did before, only that more elegantly with this API, we will average the different accuracy measures along the five splits.

This time, however, we will take a different approach with selecting the best model. As we are only treating with one time series and with much more complex models, we expect that when we retrain the model, the most performant one will no longer be. Nonetheless, we do expect that the best three models remain so after considering the whole dataset. Hence, we will on the following keep the three models with lower MAE among the seven.

# Cross-validate every candidate model over the rolling-origin CV splits,
# refitting and scoring each model once per split
models_resample <- modeltime_fit_resamples(
    models_ml,
    resamples = cvsplits,
    control   = control_resamples(verbose = FALSE)
)

# Visualise the per-split accuracy measures, summarised across splits
# with the mean
plot_modeltime_resamples(models_resample,
                         .summary_fn = mean,
                         .point_size = 3)
# Average the accuracy measures over the CV splits, then tidy the column
# names: strip the "_mean" suffix added by the summary and upper-case the
# metric names. Columns we do not report are dropped.
accuracy_resample <- models_resample %>%
    modeltime_resample_accuracy(summary_fns = list(mean = mean)) %>%
    rename_with(\(nm) toupper(gsub("_mean", "", nm)), .cols = !starts_with(".")) %>%
    select(-c(.type, N, SMAPE, RSQ))

# Render the averaged accuracy measures as a static (non-interactive) table
table_modeltime_accuracy(accuracy_resample,
                         theme          = cosmo(),
                         .show_sortable = FALSE,
                         .searchable    = FALSE)
# Identify the three models with the lowest cross-validated MAE.
# head(order(...), 3) is preferred over order(...)[1:3]: if the table ever
# holds fewer than three models it returns the available indices instead
# of producing NA subscripts.
best_models <- accuracy_resample$.model_desc[head(order(accuracy_resample$MAE), 3)]
print(paste0("The best three models are ", best_models[1], 
             ", ", best_models[2], " and ", best_models[3]))
## [1] "The best three models are PROPHET W/ XGBOOST ERRORS, XGBOOST and RANDOMFOREST"

Once we have our ensemble of models identified, we are ready to test them against the test partition to get an idea of the accuracy we can expect from them. In this case, both Random Forest and XGBoost did an exceptional job, with the latter taking the edge, reaching an MAE which stands at a fourth of the one obtained with simple models. We can also see this visually in the plot. Please note the plot is interactive and layers may be shown or hidden as desired.

# Keep only the three best-performing models. %in% replaces the chained
# equality/| tests — it is idiomatic and works for any number of selected
# models in best_models.
best_ml <- models_ml %>% 
    filter(.model_desc %in% best_models)

# Calibrate the selected models against the held-out test partition
calibrated_ml <- modeltime_calibrate(best_ml, testing(traintest))

# Forecast over the test window and overlay it on the observed series
test_forecast <- calibrated_ml %>%
    modeltime_forecast(new_data    = testing(traintest),
                       actual_data = as_tibble(data_ts))
plot_modeltime_forecast(test_forecast, .legend_max_width = 25)
# Accuracy on the test partition. The MAAPE value is displayed under the
# MAPE column header (MAPE is overwritten, then the MAAPE column dropped),
# together with the other unreported metrics.
calibrated_ml %>%
    modeltime_accuracy(metric_set = extended_forecast_accuracy_metric_set()) %>%
    rename_with(toupper, .cols = !starts_with(".")) %>%
    mutate(MAPE = MAAPE) %>%
    select(-c(.type, MAAPE, SMAPE, RSQ)) %>%
    table_modeltime_accuracy(theme          = cosmo(),
                             .show_sortable = FALSE,
                             .searchable    = FALSE)

At last, we forecast for half a year. Immediately we see that the forecasts produced by XGBoost and Random Forest, which we identified in the holdout test as the best performing, are far more realistic and detailed than the ones we got from simple modelling. This speaks in favour of advanced ML models, which are able to adapt much better — fundamental for our hourly data — but also of the crucial importance of taking into account the presence of polluting agents for temperature forecasting.

# Refit the calibrated models on the full dataset before forecasting ahead
final_ml <- modeltime_refit(calibrated_ml, data = as_tibble(data_ts))

# Half-year-ahead forecast plotted against the historical series
future_forecast <- modeltime_forecast(final_ml,
                                      new_data    = as_tibble(future_data),
                                      actual_data = as_tibble(data_ts))
plot_modeltime_forecast(future_forecast, .legend_max_width = 25)

As a final addition, we plot the mean of the three forecasts using an ensemble model.

# Combine the refitted models into a single mean-averaging ensemble
ensemble_ml <- ensemble_average(final_ml, type = "mean")

# Test-partition accuracy of the ensemble; as before, the MAAPE value is
# displayed under the MAPE column header before dropping unreported metrics
ensemble_ml %>%
    modeltime_calibrate(testing(traintest)) %>%
    modeltime_accuracy(metric_set = extended_forecast_accuracy_metric_set()) %>%
    rename_with(toupper, .cols = !starts_with(".")) %>%
    mutate(MAPE = MAAPE) %>%
    select(-c(.type, MAAPE, SMAPE, RSQ)) %>%
    table_modeltime_accuracy(theme          = cosmo(),
                             .show_sortable = FALSE,
                             .searchable    = FALSE)
# Wrap the ensemble in a modeltime table so it can be forecast, then plot
# it against the historical series
ensemble_forecast <- modeltime_table(ensemble_ml) %>%
    modeltime_forecast(new_data    = as_tibble(future_data),
                       actual_data = as_tibble(data_ts))
plot_modeltime_forecast(ensemble_forecast, .legend_max_width = 25)

Conclusions

Throughout the entire duration of the project, we have diligently analysed various time series data of hourly frequency, and we have been able to effectively describe and forecast them. Our primary objective throughout this endeavour was to conduct research on the significance of polluting agents in short-term temperature forecasting. In contrast to our previous assumptions, we have observed unmistakable indications that the inclusion of these agents greatly enhances the accuracy of our forecasts.

Furthermore, we have explored Machine Learning techniques and succeeded in refining our forecasts to capture intricate monthly and daily seasonal patterns, while continuing to emphasize the fundamental role that polluting gases play.

We view the outcome of this project as a resounding success, not only because we have been able to achieve our goal of producing realistic forecasts, but also because we have gained valuable insight into the critical importance of our emissions to the climate and the pressing need for us to actively prioritize the preservation of our planet.


  1. Internal testing showed that Prophet, ETS and ARIMA frequently tend to give null models when adding predictors. Moreover, the two models considered here offer a very neat way of knowing whether the considered variables are indeed related to the temperature or, as in the case of VAR, provide feature selection in themselves.↩︎

  2. This is not to say that concentrations will not be correlated — they will be — but rather that a high concentration of one agent should not produce an effect on the rest, as the relation should stem from both having the same emission channels.↩︎

  3. Remember that this variable excludes (to a first approximation) the contributions due to Nitrogen Dioxide.↩︎